{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 使用 LSTM 来自动生成周杰伦歌词的简单示范\n",
    "\n",
    "在这一个章节中，我们介绍使用 PyTorch 来处理序列化的数据，比如句子。\n",
    "\n",
    "我们直接使用 nn.LSTM 来模拟生成周杰伦歌词。\n",
    "\n",
    "对于 RNN 理解可以参考前一篇 [RNN](https://github.com/LianHaiMiao/pytorch-lesson-zh/blob/master/basis/rnn.ipynb)\n",
    "\n",
    "PS: 数据来源：[李沐-GLUON-循环神经网络 — 从0开始](https://zh.gluon.ai/chapter_recurrent-neural-networks/rnn-scratch.html)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## RNN"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import torch\n",
    "from torch.autograd import Variable\n",
    "import torch.nn as nn\n",
    "import torch.nn.functional as F\n",
    "import os\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "class Corpus(object):\n",
    "    \"\"\"\n",
    "        构建语料库的类\n",
    "        path: 文件路径\n",
    "    \"\"\"\n",
    "    def __init__(self, path):\n",
    "        self.path = path\n",
    "        self.char2id = {}\n",
    "        self.id2char = {}\n",
    "        self.corpus_indices = None\n",
    "    def get_data(self):\n",
    "        with open(self.path, 'r', encoding='utf8') as f:\n",
    "            chars = f.read()\n",
    "        chars_list = chars.replace('\\n', ' ').replace('\\r', ' ')\n",
    "        # 开始创建索引 word 2 id\n",
    "        idx = 0\n",
    "        for char in chars_list:\n",
    "            if not char in self.char2id:\n",
    "                self.char2id[char] = idx\n",
    "                self.id2char[idx] = char\n",
    "                idx += 1\n",
    "        # 将 corpus 里面的 char 用 index表示\n",
    "        self.corpus_indices = [self.char2id[char] for char in chars_list]\n",
    "    \n",
    "    # 获取 corpus 的长度\n",
    "    def __len__(self):\n",
    "        return len(self.char2id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 构建 Config 类，用于控制超参数\n",
    "class Config(object):\n",
    "    def __init__(self):\n",
    "        self.embed_size = 128 # embedding size\n",
    "        self.hidden_size = 1024 # RNN中隐含层的 size\n",
    "        self.num_layers = 1 # RNN 中的隐含层有几层，我们默认设置为 1层\n",
    "        self.epoch_num = 50 # 训练迭代次数\n",
    "        self.sample_num = 10 # 随机采样\n",
    "        self.batch_size = 32 # batch size\n",
    "        self.seq_length = 35 # seq length\n",
    "        self.lr = 0.002 #learning rate\n",
    "        self.path = \"./LSTM/jaychou_lyrics.txt\" # 歌词数据集\n",
    "        self.prefix = ['分开', '战争中', '我想'] # 测试阶段，给定的前缀，我们用它来生成歌词\n",
    "        self.pred_len = 50 # 预测的字符长度\n",
    "        self.use_gpu = True\n",
    "        \n",
    "config = Config()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 这里简单一些，我们直接用一个函数来作为迭代器生成训练样本\n",
    "def getBatch(corpus_indices, batch_size, seq_length, config):\n",
    "    data_len = len(corpus_indices)\n",
    "    batch_len = data_len // config.batch_size\n",
    "    corpus_indices = torch.LongTensor(corpus_indices)\n",
    "    # 将训练数据的 size 变成 batch_size x seq_length\n",
    "    indices = corpus_indices[0: batch_size * batch_len].view(batch_size, batch_len)\n",
    "    for i in range(0, indices.size(1) - seq_length, seq_length):\n",
    "        input_data = Variable(indices[:, i: i + seq_length])\n",
    "        target_data = Variable(indices[:, (i + 1): (i + 1) + seq_length].contiguous())\n",
    "        # use GPU to train the model\n",
    "        if config.use_gpu:\n",
    "            input_data = input_data.cuda()\n",
    "            target_data = target_data.cuda()\n",
    "        yield(input_data, target_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 将当前的状态从计算图中分离，加快训练速度\n",
    "def detach(states):\n",
    "    return [state.detach() for state in states] "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 定义 LSTM 模型\n",
    "class lstm(nn.Module):\n",
    "    # input: \n",
    "    # x: 尺寸为 batch_size * seq_length 矩阵。\n",
    "    # hidden: 尺寸为 batch_size * hidden_dim 矩阵。\n",
    "    # output:\n",
    "    # out: 尺寸为 batch_size * vocab_size 矩阵。\n",
    "    # h: 尺寸为 batch_size * hidden_dim 矩阵。\n",
    "    def __init__(self, vocab_size, embed_size, hidden_size, num_layers=1):\n",
    "        super(lstm, self).__init__()\n",
    "        self.embed = nn.Embedding(vocab_size, embed_size)\n",
    "        self.rnn = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)\n",
    "        self.linear = nn.Linear(hidden_size, vocab_size)\n",
    "        self.init_weights()\n",
    "        \n",
    "    def forward(self, x, hidden):\n",
    "        embeds = self.embed(x)\n",
    "        \n",
    "        out, hidden = self.rnn(embeds, hidden)\n",
    "\n",
    "        out = out.contiguous().view(out.size(0)*out.size(1), -1) # out 的 size 变成 (batch_size*sequence_length, hidden_size)\n",
    "        \n",
    "        out = self.linear(out) # (batch_size*sequence_length, hidden_size) -> (batch_size*sequence_length, vocab_size)\n",
    "        return out, hidden\n",
    "    \n",
    "    def init_weights(self):\n",
    "        self.embed.weight = nn.init.xavier_uniform(self.embed.weight)\n",
    "        self.linear.bias.data.fill_(0)\n",
    "        self.linear.weight = nn.init.xavier_uniform(self.linear.weight)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 注意事项\n",
    "\n",
    "在模型的初始化参数中，我们看到了 num_layers 这个参数，表示的含义是我们设置多少层隐含层。通常我们设置一层，当然也可以根据自己的意愿设置多层。\n",
    "\n",
    "当 num_layers = 2 时，如图所示：\n",
    "\n",
    "![num_layers = 2](./LSTM/img/num_layers.png)\n",
    "\n",
    "\n",
    "**在训练过程中，我们会使用一个叫 detach 的函数** 这个函数的主要作用就是将隐含状态从计算图中分离出来。\n",
    "\n",
    "假设，我们现在训练到了第 $t+1$ 个 batch， 那么这次训练过程中的隐含层输入是第 $t$ 个batch的输出隐含层状态，为了让模型参数的梯度计算只依赖于当前的批量序列，从而减小每次迭代的计算开销，我们可以使用detach函数来将隐含状态从计算图分离出来。\n",
    "\n",
    "如果上面的话太拗口，一句话简单解释 detach 的作用就是： **在 RNN 模型中，这样做可以加速运算。**\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 构建语料库\n",
    "corpus = Corpus(config.path)\n",
    "# 处理 data\n",
    "corpus.get_data()\n",
    "# 模型初始化\n",
    "lstm = lstm(len(corpus), config.embed_size, config.hidden_size, config.num_layers)\n",
    "# 使用 gpu\n",
    "if config.use_gpu:\n",
    "    lstm = lstm.cuda()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Loss and Optimizer\n",
    "criterion = nn.CrossEntropyLoss()\n",
    "optimizer = torch.optim.Adam(lstm.parameters(), lr=config.lr)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 使用训练好的 LSTM 在给定前缀的前提下，自动生成歌词。\n",
    "def predict(model, prefix, config, corpus):\n",
    "    \"\"\"\n",
    "        model 是模型， prefix是前缀， config 是参数类， corpus是语料库类\n",
    "    \"\"\"\n",
    "    state_h = Variable(torch.zeros(config.num_layers, 1, config.hidden_size)) # 起始的hidden status\n",
    "    state_c = Variable(torch.zeros(config.num_layers, 1, config.hidden_size)) # 起始的cell status\n",
    "    \n",
    "    # use gpu\n",
    "    if config.use_gpu:\n",
    "        state_h = state_h.cuda()\n",
    "        state_c = state_c.cuda()\n",
    "    # become a tuple\n",
    "    state = (state_h, state_c)\n",
    "    output = [corpus.char2id[prefix[0]]]\n",
    "    for i in range(config.pred_len + len(prefix)):\n",
    "        X = Variable(torch.LongTensor(output)).unsqueeze(0)\n",
    "        # use gpu\n",
    "        if config.use_gpu:\n",
    "            X = X.cuda()\n",
    "        Y, state = model(X, state)\n",
    "        # 我们将结果变成概率，选择其中概率最大的作为预测下一个字符\n",
    "        prob = Y.data[0].exp()\n",
    "        word_id = torch.multinomial(prob, 1)[0]\n",
    "        if i < len(prefix) - 1:\n",
    "            next_char = corpus.char2id[prefix[i+1]]\n",
    "        else:\n",
    "            next_char = int(word_id)\n",
    "        output.append(next_char)\n",
    "    print(\"\".join([corpus.id2char[id] for id in output]))\n",
    "    return "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 0. Perplexity 536.237761\n",
      "分开迎天过爱脚的半正去 寒l语  过梦文状素太上草的在在里衡与事安照 飘宗 西去外常的忆于活过功差的斯烧弹\n",
      "战争中的的夫一南的弟猴清觉喜瑚掬墙走过朝哟你颜标不 贴马身功在驚杰 清狗朧像慰　幕这钱喘一帮 为夫摇运永灰 \n",
      "我想哦想泪绪 咿无  婆一   花忆加持你的异专的你太 要飘统給到说一  無 的广四  双不喜的 永守 溪\n",
      "Epoch 10. Perplexity 5.840660\n",
      "分开觉纵手不回为忘历里 在旧无都的分下在好发在夕张里坠在来面看睡最 就来开手遍崖的地手 游开还心累的开道不\n",
      "战争中光的不檐 疤默入` 兮杰度恰林  色腰候运瞑片蛮水叶a跑 e间 作伤声婪 之邦 腰谢兽 院巷 腰北情式\n",
      "我想不舍  要故 有的爱生在的诗娘静完的还会 写在开朝全没来环  那们就不的绿说知试  们脸们的的们就在已\n",
      "Epoch 20. Perplexity 1.176746\n",
      "分开寸手来着到手才喜旧 穿生随明关开的变开你带接镜脸的的绪身 跑开开在情面有感命 开去水念说爱拳长 的去不\n",
      "战争中落头 诉绪制静有南容水许落契 剧上国子 伤指在白场 洁具香也下下落  绪绪一朝入内 飘满头眼夜  檐制\n",
      "我想给开  掉知在的面们 陪会要知的 没变会在只知怎没决相  会不踏一接一用们ㄟ说说开数 的的生只们们更睡\n",
      "Epoch 30. Perplexity 1.028150\n",
      "分开记难的新新候去 来身手示不于打应为关手 降里的的爱海  开手在飘手上留的在头方动头来下想来 想慢不长一\n",
      "战争中答制 腰绪速继量 张字头里在眼平  慈持中支子 意命鱼夜续间命支蔓  兴头拍蓝吼照失归的 作角 的上丽\n",
      "我想是睡 的将想只要甘女可的代  也也不有有们的的们不  不知再有 受轻的 默会里的的们 的轻们的的们  \n",
      "Epoch 40. Perplexity 1.019534\n",
      "分开下 的也生上回的  绪开开的可开就动手手将心套 美来开猜不可来手 的想下 可开喜也还去定来笔的的想  \n",
      "战争中a色 迹色里夜的道绪声  视制中腰毛腰 声字过月 入命 虹露聆杀空 色牙中卡的球头  烟足林焚 灯惯易\n",
      "我想睡  不拥再 一再一的 一变不变在  用不全有们们 睡不在  轻就的知牵 去们也都不永损就 的已那 变\n"
     ]
    }
   ],
   "source": [
    "# 开始训练\n",
    "for epoch in range(config.epoch_num):\n",
    "    # 由于使用的是 lstm，我们初始化 hidden status 和 cell\n",
    "    state_h = Variable(torch.zeros(config.num_layers, config.batch_size, config.hidden_size)) # 起始的hidden status\n",
    "    state_c = Variable(torch.zeros(config.num_layers, config.batch_size, config.hidden_size)) # 起始的cell status\n",
    "    # use gpu\n",
    "    if config.use_gpu:\n",
    "        state_h = state_h.cuda()\n",
    "        state_c = state_c.cuda()\n",
    "    \n",
    "    hidden = (state_h, state_c)\n",
    "    \n",
    "    train_loss = [] # 训练的总误差\n",
    "    \n",
    "    for i,batch in enumerate(getBatch(corpus.corpus_indices, config.batch_size, config.seq_length, config)):\n",
    "        inputs, targets = batch\n",
    "        # Forward + Backward + Optimize\n",
    "        lstm.zero_grad()\n",
    "        hidden = detach(hidden)\n",
    "        \n",
    "        outputs, hidden = lstm(inputs, hidden)\n",
    "\n",
    "        loss = criterion(outputs, targets.view(-1))\n",
    "        train_loss.append(loss.data[0])\n",
    "        loss.backward()\n",
    "        torch.nn.utils.clip_grad_norm(lstm.parameters(), 0.5) # 梯度剪裁\n",
    "        optimizer.step()\n",
    "    # 采样，进行预测\n",
    "    if epoch % config.sample_num == 0:\n",
    "        print(\"Epoch %d. Perplexity %f\" % (epoch, np.exp(np.mean(train_loss))))\n",
    "        # 对给定的歌词开头，我们自动生成歌词\n",
    "        for preseq in config.prefix:\n",
    "            predict(lstm, preseq, config, corpus)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "这里只是简单的训练了 50 个 epoch，多次训练，可以看得出来，他们勉强有点... 像个句子了，尽管意思还是很模糊。"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
